library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.5 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.0.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(readxl)
library(here)
## here() starts at C:/Users/Malcolm/Documents/Code/GitHub/dirty_data_codeclan_project_mcheyne/Task 3 Sea bird observation data
# Load the cleaned seabird observations from the project's clean_data folder.
clean_data <- here("clean_data/seabirds_cleaned_data.csv") %>%
  read_csv()
## Rows: 49020 Columns: 52
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (19): common_name, scientific_name, species_abbreviation, age, plphase,...
## dbl (28): record_x, record_id, wanplum, total_sighting, num_feeding, num_on...
## lgl (3): sex, air_temp, salinity
## dttm (2): date, time
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the imported data for inspection.
clean_data
Cleaned the data from the seabird observation dataset to answer the questions.
Took out any records with “NO BIRDS RECORDED”.
Took out sensu, lato (unidentified) or upper-case letters at the end of common_name, as these were not part of the name but descriptions of the birds.
Have added Ext to the questions as I was not sure of your definition of type and species, i.e. whether Royal / Wandering albatross and Black-browed albatross are different birds, or whether they are all the same species and should just be counted as albatross. My first answer uses the former; the Ext parts use the latter.
Use a script file to clean
full_join() the 2 data sheets so as not to lose any data
clean_names()
Renamed the columns for easier reading
recode() the data for easier reading
Removed descriptions of the birds from the common_name
Write to a .csv file
# Narrow the cleaned data down to the columns used by the queries below,
# then print the result.
seabirds <- select(
  clean_data,
  record_id, common_name, scientific_name, species_abbreviation,
  total_sighting, num_group_sighting, lat
)
seabirds
# Number of rows recorded for each common name, most frequent first.
seabirds %>%
  count(common_name, name = "count", sort = TRUE)
# Wandering albatross 11293
# Sum total_sighting over every row whose common name contains "albatross"
# (case-insensitive match); missing sighting counts are ignored.
seabirds %>%
  filter(str_detect(common_name, "(?i)albatross")) %>%
  summarise(count = sum(total_sighting, na.rm = TRUE))
# All albatross 30424
# Sum total_sighting for each common name (missing counts ignored) and list
# the names from the largest total to the smallest.
seabirds %>%
  group_by(common_name) %>%
  summarise(
    count = sum(total_sighting, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(-count)
# Short-tailed shearwater 982553
# Sum total_sighting over every row whose common name contains "shearwater"
# (case-insensitive match); missing sighting counts are ignored.
seabirds %>%
  filter(str_detect(common_name, "(?i)shearwater")) %>%
  summarise(count = sum(total_sighting, na.rm = TRUE))
# All shearwater 1394468
# Keep only the rows recorded at a latitude greater than -30, then sum
# total_sighting per common name and sort from the largest total down.
seabirds %>%
  filter(lat > -30) %>%
  group_by(common_name) %>%
  summarise(count = sum(total_sighting, na.rm = TRUE)) %>%
  arrange(desc(count))
# Taking above a latitude of -30 as nearer the equator ie 0
# Wedge-tailed shearwater 855
# Sum total_sighting for rows that are both a shearwater (case-insensitive
# match on the common name) and recorded at a latitude greater than -30.
seabirds %>%
  filter(
    str_detect(common_name, "(?i)shearwater"),
    lat > -30
  ) %>%
  summarise(count = sum(total_sighting, na.rm = TRUE))
# All shearwater seen above a latitude of -30 (nearer the equator ie 0) 888
# Common names whose largest recorded num_group_sighting is exactly 1.
#
# Rows with a missing num_group_sighting are dropped before aggregating.
# Previously max(..., na.rm = TRUE) was evaluated on names that had no
# non-missing values at all, which warned "no non-missing arguments to max;
# returning -Inf" once per such name. Those names produced -Inf and were
# discarded by the == 1 filter anyway, so the set of names returned is the
# same; the result is now one ungrouped row per name, ordered by name.
seabirds %>%
  filter(!is.na(num_group_sighting)) %>%
  group_by(common_name) %>%
  summarise(max_sighting = max(num_group_sighting)) %>%
  filter(max_sighting == 1) %>%
  select(common_name)
# 23 birds only seen once
# Same query as "largest num_group_sighting is exactly 1", but with every
# variety first collapsed into its family label (for example any name
# containing "shearwater" becomes "shearwater").
#
# The recode happens BEFORE grouping. Previously common_name was rewritten
# while the data was already grouped by common_name, which only works because
# dplyr regroups after a grouping column is modified.
#
# case_when() keeps the original precedence: patterns are tried in the order
# listed and the first match wins. Names matching no pattern (and NA names)
# are left unchanged.
#
# Rows with a missing num_group_sighting are removed before aggregating so
# max() is never called on an all-missing group (this is what produced the
# "no non-missing arguments to max; returning -Inf" warnings). Such groups
# could never equal 1, so the names returned are unchanged.
seabirds %>%
  mutate(
    common_name = case_when(
      str_detect(common_name, "(?i)shearwater") ~ "shearwater",
      str_detect(common_name, "(?i)albatross") ~ "albatross",
      str_detect(common_name, "(?i)mollymawk") ~ "mollymawk",
      str_detect(common_name, "(?i)petrel") ~ "petrel",
      str_detect(common_name, "(?i)prion") ~ "prion",
      str_detect(common_name, "(?i)skua") ~ "skua",
      TRUE ~ common_name
    )
  ) %>%
  filter(!is.na(num_group_sighting)) %>%
  group_by(common_name) %>%
  summarise(max_sighting = max(num_group_sighting)) %>%
  filter(max_sighting == 1) %>%
  select(common_name)
# 2 individual species of birds only seen once,
# when the variations are combined as one group
# Sum total_sighting over every row whose common name contains "penguin"
# (case-insensitive match); missing sighting counts are ignored.
seabirds %>%
  filter(str_detect(common_name, "(?i)penguin")) %>%
  summarise(count = sum(total_sighting, na.rm = TRUE))
# 158 penguins